library(knitr)
library(readr)
library(tidyverse)
library(ggplot2)
library(purrr)
library(broom)
library(gridExtra)
library(tidyverse)
library(broom)
library(mice)
library(GGally)
source("../R/ExtractRecordall.R")
source("../R/importance.R")
source("../R/rest.R")

Extracting data

# Load the data: every player is extracted from the same pair of files by
# extract_records_all() (sourced from ../R/ExtractRecordall.R).
matches_csv <- "../data/2017-ausopen-matches.csv"
points_csv <- "../data/2017-ausopen-points.csv"

# men
Federer <- extract_records_all(matches_csv, points_csv, "Roger Federer")
Nadal <- extract_records_all(matches_csv, points_csv, "Rafael Nadal")
Alex <- extract_records_all(matches_csv, points_csv, "Alexander Zverev")
Raonic <- extract_records_all(matches_csv, points_csv, "Milos Raonic")

# women
Kerber <- extract_records_all(matches_csv, points_csv, "Angelique Kerber")
Williams_v <- extract_records_all(matches_csv, points_csv, "Venus Williams")
Wozniacki <- extract_records_all(matches_csv, points_csv, "Caroline Wozniacki")
Williams_s <- extract_records_all(matches_csv, points_csv, "Serena Williams")

Group data

# List of the datasets of interest.
# The order MUST match `name` below: names are paired with datasets by
# position downstream (map2_df), so element i of one list has to describe
# element i of the other. Serena (Williams_s) therefore comes before
# Venus (Williams_v), exactly as in `name`.
player_dt <- list(
  Federer, Nadal, Alex, Raonic,
  Kerber, Wozniacki, Williams_s, Williams_v
)
# List of the players' names, in the same order as `player_dt`
name <- list(
  "Roger Federer", "Rafael Nadal", "Alexander Zverev", "Milos Raonic",
  "Angelique Kerber", "Caroline Wozniacki", "Serena Williams", "Venus Williams"
)
# Fail fast if a player is added to one list but not the other
stopifnot(length(player_dt) == length(name))

female <- list(
  "Angelique Kerber", "Caroline Wozniacki", "Serena Williams", "Venus Williams"
)
male <- list(
  "Roger Federer", "Rafael Nadal", "Alexander Zverev", "Milos Raonic"
)

Dealing with missing data

Clean data

#' Prepare one player's point records for modelling.
#'
#' Adds the player's name, the `rest` factor (from `rest()`), the point
#' importance (from `point_impt()`) and the log of `RallyCount`, then keeps
#' only the points served by the player with a non-zero serve speed.
#'
#' @param dt Point-level data for one player. It must contain the columns
#'   referenced below (`player1`, `ServeIndicator`, `Speed_KMH`,
#'   `RallyCount`, ...).
#' @param name The player's name; compared with `player1` to decide which
#'   `ServeIndicator` value (1 or 2) identifies the player's own serves.
#' @return The filtered data, reduced to the modelling columns.
clean_data <- function(dt, name) {
  dt %>%
    mutate(
      name = name,
      rest = as.factor(rest(dt)),
      impt = point_impt(dt),
      RallyCount = log(RallyCount)
    ) %>%
    filter(
      # drop the double-fault points (recorded with a speed of 0)
      Speed_KMH != 0,
      # keep only the points served by the player of interest
      ServeIndicator == ifelse(player1 == name, 1, 2)
    ) %>%
    dplyr::select(
      PointNumber, impt, dist, cum_dist, rest, time, MatchNo, SetNo,
      ServeNumber, name, Speed_KMH, cum_time, RallyCount, Gender
    )
}


# clean every element of player_dt_miss with its matching name and
# row-bind the results into a single data frame
player_dt_clean <- map2_df(player_dt_miss, name, clean_data)

# residuals of cumulative distance and cumulative time after regressing each
# on PointNumber (fitted on the pooled data of all players)
cum_dist_resid <- augment(
  lm(cum_dist ~ PointNumber, data = player_dt_clean)
)[[".resid"]]
cum_time_resid <- augment(
  lm(cum_time ~ PointNumber, data = player_dt_clean)
)[[".resid"]]

# attach both residual vectors as columns
player_dt_clean <- mutate(
  player_dt_clean,
  cum_dist_resid = cum_dist_resid,
  cum_time_resid = cum_time_resid
)

Model

#' Fit the linear model of serve speed.
#'
#' Regresses `Speed_KMH` on the point-level covariates, including the
#' `PointNumber` x `MatchNo` interaction and the two residual columns
#' `cum_dist_resid` and `cum_time_resid`.
#'
#' @param data A data frame containing every variable named in the formula.
#' @return The fitted `lm` object.
fit_lm <- function(data) {
  lm(
    Speed_KMH ~
      PointNumber + impt + time + dist + rest + MatchNo + RallyCount +
      PointNumber * MatchNo + cum_dist_resid + cum_time_resid,
    data = data
  )
}


#' Fit one model per player.
#'
#' Nests the data by `name` and applies `fit` to each player's subset.
#'
#' @param dt A data frame with a `name` column plus whatever columns `fit`
#'   needs.
#' @param fit A function taking one player's data frame and returning a
#'   fitted model (e.g. `fit_lm`).
#' @return An ungrouped tibble with one row per player and the columns
#'   `name`, `data` (list-column of per-player data) and `model`
#'   (list-column of fitted models).
build_linear_model <- function(dt, fit) {
  dt %>%
    group_by(name) %>%
    nest() %>%
    mutate(model = map(data, fit)) %>%
    # tidyr >= 1.0 keeps the grouping through nest(); drop it so callers get
    # a plain tibble whatever the tidyr version (no-op on older versions)
    ungroup()
}

#' Collect the coefficient estimates of the per-player models.
#'
#' @param dt A tibble with a `name` column and a `model` list-column of
#'   fitted models, as returned by `build_linear_model()`.
#' @return A wide table with one row per player and one column per model
#'   term, holding the coefficient estimates.
fetch_coef <- function(dt) {
  dt %>%
    # Build the list-column explicitly and unnest it by name: the legacy
    # `unnest(model %>% map(tidy))` form is not evaluated against the data
    # by tidyr >= 1.0, while this form works on old and new versions alike.
    mutate(coef = map(model, tidy)) %>%
    dplyr::select(name, coef) %>%
    unnest(coef) %>%
    dplyr::select(name, term, estimate) %>%
    spread(term, estimate)
}

# split by first and second serve
# NOTE(review): the positional indexing assumes ServeNumber takes exactly two
# values that sort as first serve, then second serve -- confirm.
player_dt_serve <- split(player_dt_clean, player_dt_clean$ServeNumber)
firstServe <- player_dt_serve[[1]]
SecondServe <- player_dt_serve[[2]]

# The list-columns below are built with mutate() and unnested by name: the
# legacy `unnest(model %>% map(f))` form is not evaluated against the data by
# tidyr >= 1.0, while this form works on old and new versions alike.

# first-serve model for every player
firstserve_lm <- build_linear_model(firstServe, fit_lm) %>%
  ungroup() %>%
  mutate(servenumber = as.factor(1))
firstserve_coef <- fetch_coef(firstserve_lm)
firstserve_aug_lm <- firstserve_lm %>%
  mutate(aug = map(model, augment)) %>%
  dplyr::select(-data, -model) %>%
  unnest(aug)
firstserve_fit_lm <- firstserve_lm %>%
  mutate(fit = map(model, glance)) %>%
  unnest(fit)

# second-serve model for every player
secondserve_lm <- build_linear_model(SecondServe, fit_lm) %>%
  ungroup() %>%
  mutate(servenumber = as.factor(2))
secondserve_coef <- fetch_coef(secondserve_lm)
secondserve_aug_lm <- secondserve_lm %>%
  mutate(aug = map(model, augment)) %>%
  dplyr::select(-data, -model) %>%
  unnest(aug)
secondserve_fit_lm <- secondserve_lm %>%
  mutate(fit = map(model, glance)) %>%
  unnest(fit)

# stack the augmented first and second serve data
model_lm <- rbind(firstserve_aug_lm, secondserve_aug_lm)
# Tag each coefficient table with its serve number before stacking, so the
# labels stay correct however many players each table contains (previously a
# hard-coded vector of eight 1s and eight 2s).
coefficient <- rbind(
  mutate(firstserve_coef, servenumber = 1),
  mutate(secondserve_coef, servenumber = 2)
)

Model Diagnostic

Predicted vs. Actual

This graph contrasts each player’s actual serving speed with the predicted value. The blue line has a slope of 1, which is where a perfect prediction would lie. The two clusters indicate how effective the prediction is when the first and second serves are modelled separately. For Federer, Nadal and Zverev, the difference between first and second serve is clear, as shown by the two main clusters, while Raonic doesn’t seem to have a clear cut between first and second serve speed.

Plot explanatory variables against each other

Model Summary: R.squared

name 1 2
Alexander Zverev 0.1169831 0.2649063
Angelique Kerber 0.3439414 0.5843132
Caroline Wozniacki 0.4012296 0.2175152
Milos Raonic 0.1303058 0.1140851
Rafael Nadal 0.1265345 0.1503419
Roger Federer 0.0608728 0.1226519
Serena Williams 0.1235949 0.3698460
Venus Williams 0.1759242 0.3598298

Check significance of variables

Firstserve

term Alexander Zverev Angelique Kerber Caroline Wozniacki Milos Raonic Rafael Nadal Roger Federer Serena Williams Venus Williams
(Intercept) 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
cum_dist_resid 0.24 0.10 0.07 0.83 0.05 0.88 0.77 0.16
cum_time_resid 0.20 0.01 0.00 0.01 0.07 0.80 0.19 0.48
dist 0.27 0.66 0.16 0.04 0.84 0.14 0.88 0.15
impt 0.16 0.07 0.83 0.71 0.45 0.00 0.30 0.83
MatchNo2 0.61 0.18 0.01 0.55 0.09 0.48 0.81 0.01
MatchNo3 0.03 0.55 0.98 0.85 0.56 0.36 0.23 0.02
MatchNo4 NA 0.70 NA 0.30 0.03 0.65 0.50 0.01
MatchNo5 NA NA NA 0.10 0.10 0.77 0.66 0.12
MatchNo6 NA NA NA NA 0.36 0.70 0.70 0.01
MatchNo7 NA NA NA NA 0.18 0.60 0.90 0.02
PointNumber 0.26 0.02 0.01 0.09 0.01 0.52 0.14 0.70
PointNumber:MatchNo2 0.99 0.02 0.00 0.62 0.93 0.29 0.78 0.11
PointNumber:MatchNo3 0.48 0.05 0.00 0.35 0.25 0.76 0.33 0.21
PointNumber:MatchNo4 NA 0.04 NA 0.15 0.15 0.23 0.79 0.90
PointNumber:MatchNo5 NA NA NA 0.05 0.04 0.90 0.98 0.76
PointNumber:MatchNo6 NA NA NA NA 0.03 0.70 0.46 0.01
PointNumber:MatchNo7 NA NA NA NA 0.08 0.93 0.91 0.15
RallyCount 0.00 0.00 0.01 0.03 0.00 0.08 0.99 0.10
rest1.5 0.15 0.76 0.02 0.11 0.38 0.66 0.28 0.62
rest2 0.25 0.91 NA 0.03 0.06 0.55 0.68 0.73
time 0.19 0.67 0.40 0.71 0.00 0.70 0.48 0.43

Secondserve

term Alexander Zverev Angelique Kerber Caroline Wozniacki Milos Raonic Rafael Nadal Roger Federer Serena Williams Venus Williams
(Intercept) 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
cum_dist_resid 0.96 0.12 0.64 0.87 0.96 0.68 0.04 0.59
cum_time_resid 0.93 0.94 0.68 0.48 0.96 0.52 0.77 0.42
dist 0.18 0.02 0.10 0.63 0.84 0.21 0.24 0.19
impt 0.57 0.27 0.85 0.30 0.73 0.28 0.27 0.14
MatchNo2 0.17 0.00 0.37 0.58 0.07 0.20 0.31 0.12
MatchNo3 0.95 0.03 0.23 0.75 0.87 0.83 0.68 0.58
MatchNo4 NA 0.90 NA 0.37 0.73 0.50 0.68 0.20
MatchNo5 NA NA NA 0.73 0.93 0.95 0.04 0.01
MatchNo6 NA NA NA NA 0.99 0.64 0.00 0.75
MatchNo7 NA NA NA NA 0.80 0.13 0.06 0.37
PointNumber 0.56 0.03 0.79 0.49 0.84 0.71 0.08 0.67
PointNumber:MatchNo2 0.17 0.26 0.84 0.45 0.44 0.35 0.65 0.17
PointNumber:MatchNo3 0.88 0.01 0.93 0.60 0.76 0.52 0.07 0.32
PointNumber:MatchNo4 NA 0.34 NA 0.42 0.96 0.79 0.61 0.33
PointNumber:MatchNo5 NA NA NA 0.57 0.98 0.78 0.12 0.23
PointNumber:MatchNo6 NA NA NA NA 0.84 1.00 0.03 0.92
PointNumber:MatchNo7 NA NA NA NA 0.58 0.67 0.13 0.20
RallyCount 0.42 0.25 0.59 0.09 0.14 0.01 0.78 0.07
rest1.5 0.89 0.66 0.79 0.82 0.44 0.81 0.93 0.32
rest2 0.25 NA 0.74 0.97 0.97 0.94 0.36 0.98
time 0.31 0.97 0.82 0.50 0.07 0.50 0.43 0.46

correlation

The correlation between point number and cumulative match time is high, and the plot shows that the relationship is linear. Thus it doesn’t matter much whether point number or match time is used as the x variable when plotting the models.

## [1] 0.9863859

Visualisation

Point Number

name 1 2
Alexander Zverev 0.1219240 -0.0599029
Angelique Kerber -0.3784410 -0.2051566
Caroline Wozniacki -0.4652797 0.0893456
Milos Raonic 0.2120854 0.1238700
Rafael Nadal -0.1679667 0.0175697
Roger Federer -0.0424842 0.0292872
Serena Williams -0.0779580 0.1021910
Venus Williams -0.0713049 0.0707765

For all players, there is a significant difference between first and second serve speed. We can also see that, on average, the male players serve significantly faster than the female players.

For the first serve, Zverev and Raonic have serving speeds mostly above 175 KMH, which is much higher than those of Nadal or Federer. Notice that Nadal does have a few fast serves at around 200 KMH.

For the second serve, Zverev shows clear evidence of a reduction in speed within each match, which could be due to his young age (and thus lack of experience) or to fatigue. For Raonic, we can find some evidence of a reduction in serving speed, but the variability of his serve speed changes a lot from match to match. For Nadal and Federer, whose serving speed is relatively consistent across games, fatigue can be captured by the variation of the serving speed.

Looking at the female players’ data, we can see that Serena and Venus Williams, who played the final, show a relatively stable serving speed, like Federer and Nadal. Kerber, in contrast, has a high variation in serving speed similar to Raonic’s, and Wozniacki’s first serve seems to get faster as the game proceeds.

Based on these, we could capture fatigue through the reduction of the serving speed (slope) as well as the variation of the serving speed (variance). Attention needs to be paid to players like Raonic, whose serving speed naturally varies a lot in each match, and Wozniacki, whose first serve seems to go against our hypothesis that serving speed will decrease as the game proceeds.

Running Distance

name 1 2
Alexander Zverev 0.0947309 -0.1325152
Angelique Kerber -0.0581313 -0.1912024
Caroline Wozniacki -0.1298826 0.2456916
Milos Raonic -0.2356662 -0.0928441
Rafael Nadal 0.0091780 -0.0145209
Roger Federer -0.1165277 -0.1356490
Serena Williams -0.0159290 0.1159422
Venus Williams -0.2304942 0.1787765

Running distance in general doesn’t seem to have much effect on the male players’ serving speed, although there is a little evidence (Nadal’s second serve) that it may reduce the second serve speed.

For the female players, on the other hand, running distance seems to increase the second serve speed. We can see that Wozniacki’s “seemingly increasing serving speed” in the previous graph is due to an increase in second serve speed. Serena also exhibits this pattern in her second serve.

Another thing to notice is that, due to the format of the women’s game (best-of-three sets rather than best-of-five as in the men’s), we observe less data for the female players than for the male players, and thus we would expect the female data to have higher variation (e.g. Kerber and Venus).

Point Importance

name 1 2
Alexander Zverev 36.794522 -15.384869
Angelique Kerber -61.497941 -21.759364
Caroline Wozniacki 9.605531 -13.445716
Milos Raonic 14.244861 -53.816542
Rafael Nadal 10.098873 6.369603
Roger Federer 57.017134 -27.113367
Serena Williams 24.504677 31.449178
Venus Williams -7.872917 -56.912844
  • In general, point importance doesn’t affect the serving speed very much.
  • However, as points become more important, players’ second serve speed tends to decrease (e.g. Raonic and Kerber), which can be viewed as a conservative approach taken by the players.
  • Also, it’s interesting to notice that on Federer’s first serve, serving speed increases as points become more important, which shows a more aggressive approach, trying to win the point via an ace. This pattern also shows in Serena’s second serve.

Rest

name 1 2
Alexander Zverev 7.705043 0.6373962
Angelique Kerber -2.430421 1.8701346
Caroline Wozniacki -16.059879 -2.9245111
Milos Raonic 10.287678 1.4194872
Rafael Nadal 2.117839 2.9106437
Roger Federer -1.506890 -1.1257766
Serena Williams 6.707852 -0.3783389
Venus Williams -3.473431 7.2900524
name 1 2
Alexander Zverev 14.774285 -8.2075095
Angelique Kerber -2.162510 NA
Caroline Wozniacki NA -3.5098499
Milos Raonic 18.376758 -0.5150704
Rafael Nadal 7.727629 0.1813265
Roger Federer 3.800889 -0.3933583
Serena Williams -5.488399 5.3713929
Venus Williams -2.714327 -0.1430077

In general, after a game break, players tend to have a higher serving speed, which indicates less fatigue. The improvement for Nadal is marginal, while it is more obvious in Zverev’s first serve and Federer’s second serve. Raonic’s behaviour is interesting in the sense that after each game break his serving speed usually drops (green dots), while after a set break he serves faster (blue dots).

Among the female players, Kerber and Wozniacki behave similarly, with a decrease in serving speed after the scheduled break. Serena Williams, in contrast, behaves similarly to the top male players, with a marginal increase in serving speed after the breaks.

Time

name 1 2
Alexander Zverev -0.0778764 0.0415548
Angelique Kerber -0.0430758 -0.0020048
Caroline Wozniacki 0.0601028 0.0271098
Milos Raonic 0.0188585 0.0323112
Rafael Nadal -0.0546811 -0.0625478
Roger Federer 0.0140779 0.0324863
Serena Williams -0.0424071 0.0309649
Venus Williams 0.0499157 -0.0343461

In general, the longer the point is played, the more the player’s serving speed decreases, though only marginally (e.g. Nadal). However, for Raonic, it seems to have a positive effect on the serving speed.

Rally Count

name 1 2
Alexander Zverev -3.8048180 -0.8309095
Angelique Kerber -4.6230006 -0.9543669
Caroline Wozniacki -2.8613005 -1.2217676
Milos Raonic -3.2077052 -3.4552284
Rafael Nadal -1.9195151 -1.1165007
Roger Federer -1.3863793 -2.9550029
Serena Williams 0.0131586 -0.2913815
Venus Williams -2.2717852 -2.1622715

From the first plot, Raonic and Federer play relatively few long-rally points, while Zverev and Nadal have more long-rally points, which may help us understand whether rally count has an effect on fatigue (serving speed).

The number of rallies played in each game is also a factor that decreases the serving speed, and the effect is obvious for the first serve of Zverev, Kerber and Venus Williams. It is interesting to note that although Nadal and Serena Williams have played a relatively high number of long-rally games, it doesn’t seem to affect their serving speed much.

cum_dist_resid

name 1 2
Alexander Zverev -0.0241589 0.0009288
Angelique Kerber 0.0487538 0.0282494
Caroline Wozniacki 0.0565705 -0.0254181
Milos Raonic -0.0063095 -0.0079096
Rafael Nadal 0.0271139 -0.0010073
Roger Federer 0.0023594 0.0077291
Serena Williams 0.0083055 -0.0617074
Venus Williams -0.0963621 0.0400308

cum_time_resid

name 1 2
Alexander Zverev 1.0283805 -0.0674622
Angelique Kerber -3.6751248 0.0641342
Caroline Wozniacki -5.3762613 -0.8874875
Milos Raonic 1.3960296 0.6083765
Rafael Nadal -0.6340924 -0.0236971
Roger Federer -0.1089691 0.3334647
Serena Williams -1.0976845 -0.2455470
Venus Williams -1.1073221 -1.1804808